The main Figure from Nature Paper

The main Figure from Nature Paper

Arumugam M, Raes J, Pelletier E, Le Paslier D, Yamada T, Mende DR, Fernandes GR, Tap J, Bruls T, Batto JM, Bertalan M. … Enterotypes of the human gut microbiome. Nature. 2011;473(7346):174-80.

library("phyloseq")
library("cluster")
data("enterotype")
ps_data = subset_samples(enterotype, SeqTech == "Sanger")
ps_data
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 553 taxa and 41 samples ]
## sample_data() Sample Data:       [ 41 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 553 taxa by 1 taxonomic ranks ]
otu_table(ps_data)[1:5,1:9]
## OTU Table:          [5 taxa and 9 samples]
##                      taxa are rows
##                    AM.AD.1    AM.AD.2  AM.F10.T1 AM.F10.T2  DA.AD.1
## -1               0.4265035 0.33226194 0.48195467 0.3885831 0.533604
## Bacteria         0.0000000 0.00000000 0.00000000 0.0000000 0.000000
## Prosthecochloris 0.0000000 0.00000000 0.00000000 0.0000000 0.000000
## Chloroflexus     0.0000000 0.00000000 0.00000000 0.0000000 0.000000
## Dehalococcoides  0.0000000 0.00026835 0.00000959 0.0000000 0.000000
##                    DA.AD.1T    DA.AD.2   DA.AD.3  DA.AD.3T
## -1               0.62499134 0.63470800 0.7798098 0.8052677
## Bacteria         0.00000000 0.00001328 0.0000000 0.0000000
## Prosthecochloris 0.00000000 0.00000000 0.0000000 0.0000000
## Chloroflexus     0.00000081 0.00000000 0.0000000 0.0000000
## Dehalococcoides  0.00000000 0.00002030 0.0000000 0.0000000
apply(otu_table(ps_data),2,sum)
##   AM.AD.1   AM.AD.2 AM.F10.T1 AM.F10.T2   DA.AD.1  DA.AD.1T   DA.AD.2 
## 0.9999999 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 
##   DA.AD.3  DA.AD.3T   DA.AD.4   ES.AD.1   ES.AD.2   ES.AD.3   ES.AD.4 
## 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 
##   FR.AD.1   FR.AD.2   FR.AD.3   FR.AD.4   FR.AD.5   FR.AD.6   FR.AD.7 
## 1.0000000 1.0000000 0.9999999 1.0000000 1.0000000 1.0000000 1.0000000 
##   FR.AD.8   IT.AD.1   IT.AD.2   IT.AD.3   IT.AD.4   IT.AD.5   IT.AD.6 
## 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000001 0.9999999 
##   JP.AD.1   JP.AD.2   JP.AD.3   JP.AD.4   JP.AD.5   JP.AD.6   JP.AD.7 
## 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 
##   JP.AD.8   JP.AD.9   JP.IN.1   JP.IN.2   JP.IN.3   JP.IN.4 
## 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
newsampledata=sample_data(ps_data)
newsampledata$Enterotype=addNA(newsampledata$Enterotype)
sample_data(ps_data)=newsampledata
table(sample_data(ps_data)$Enterotype)
## 
##    1    2    3 <NA> 
##    8    6   18    9
levels(sample_data(ps_data)$Enterotype) =c("1","2","3","0")
table(sample_data(ps_data)$Enterotype)
## 
##  1  2  3  0 
##  8  6 18  9

This has 41 samples, 9 of which were mysteriously dropped from the enterotype study, we are going to re-include them with the Enterotype classifier “0”.

data = as(otu_table(ps_data), "matrix")
data = data[-1, ]

Choose a distance (Jensen-Shannon symmetrized KL divergence)

data.dist = dist.JSD(data)

Ordination

PCOA(principal coordinates analysis or multidimen. scaling)

require(ggplot2)
## Loading required package: ggplot2
#Standard Multidimensional Scaling
ent.jsd=ordinate(ps_data,method="MDS",distance=data.dist)

p <- NULL
# Create plot, store as temp variable, p
p <- plot_ordination(ps_data, ent.jsd, color = "Enterotype", shape = "Enterotype")
    # Add title to each plot
    p <- p + ggtitle("MDS using JSD distance  ")
p

Notice the difference with

p + coord_fixed()

NonMetric Multidimensional Scaling

#NonMetric Multidimensional Scaling
ent.jsd.nm=ordinate(ps_data,method="NMDS",distance=data.dist)
## Run 0 stress 0.1495351 
## Run 1 stress 0.1486988 
## ... New best solution
## ... Procrustes: rmse 0.01729053  max resid 0.09053615 
## Run 2 stress 0.1490212 
## ... Procrustes: rmse 0.01190292  max resid 0.06869235 
## Run 3 stress 0.158822 
## Run 4 stress 0.1810162 
## Run 5 stress 0.3983574 
## Run 6 stress 0.195177 
## Run 7 stress 0.1486994 
## ... Procrustes: rmse 0.0003406454  max resid 0.001676446 
## ... Similar to previous best
## Run 8 stress 0.1906354 
## Run 9 stress 0.1463769 
## ... New best solution
## ... Procrustes: rmse 0.09804789  max resid 0.443149 
## Run 10 stress 0.1588739 
## Run 11 stress 0.1530669 
## Run 12 stress 0.1463768 
## ... New best solution
## ... Procrustes: rmse 0.0001023039  max resid 0.0004768064 
## ... Similar to previous best
## Run 13 stress 0.1612107 
## Run 14 stress 0.145358 
## ... New best solution
## ... Procrustes: rmse 0.1142685  max resid 0.3282707 
## Run 15 stress 0.1486998 
## Run 16 stress 0.1486995 
## Run 17 stress 0.1528798 
## Run 18 stress 0.1453579 
## ... New best solution
## ... Procrustes: rmse 0.0001545114  max resid 0.0006215782 
## ... Similar to previous best
## Run 19 stress 0.1453593 
## ... Procrustes: rmse 0.000325628  max resid 0.001447927 
## ... Similar to previous best
## Run 20 stress 0.1588225 
## *** Solution reached
p <- NULL
# Create plot, store as temp variable, p
p <- plot_ordination(ps_data, ent.jsd.nm, color = "Enterotype", 
                     shape = "Enterotype")
# Add title to each plot
p <- p + ggtitle("Non Metric MDS using JSD distance  ")
p

Correspondence Analysis

#Correspondence Analysis
ent.ca = ordinate(ps_data, method = "CCA", distance = NULL)

p <- NULL
# Create plot, store as temp variable, p
p <- plot_ordination(ps_data, ent.ca, color = "Enterotype", shape = "Enterotype")
    # Add title to each plot
    p <- p + ggtitle("Correspondence Analysis (chisquare distance)")
p 

Bray Curtis

#Bray-Curtis
ent.bray = ordinate(ps_data, method = "MDS", distance = "bray")
# Create plot, store as temp variable, p
p <- plot_ordination(ps_data, ent.bray, color = "Enterotype", shape = "Enterotype")
    # Add title to each plot
    p <- p + ggtitle("Bray Curtis Distance, MDS")
p +coord_fixed()

Nonmetric MDS

#Bray-Curtis
ent.bray.nm = ordinate(ps_data, method = "NMDS", distance = "bray")
## Run 0 stress 0.1196547 
## Run 1 stress 0.1198836 
## ... Procrustes: rmse 0.05939135  max resid 0.2650939 
## Run 2 stress 0.1110788 
## ... New best solution
## ... Procrustes: rmse 0.03756215  max resid 0.2268392 
## Run 3 stress 0.1110441 
## ... New best solution
## ... Procrustes: rmse 0.001621643  max resid 0.007368474 
## ... Similar to previous best
## Run 4 stress 0.1198812 
## Run 5 stress 0.1110444 
## ... Procrustes: rmse 6.390216e-05  max resid 0.0002548962 
## ... Similar to previous best
## Run 6 stress 0.1287199 
## Run 7 stress 0.111046 
## ... Procrustes: rmse 0.0003098427  max resid 0.001199108 
## ... Similar to previous best
## Run 8 stress 0.1110457 
## ... Procrustes: rmse 0.0002729902  max resid 0.001012941 
## ... Similar to previous best
## Run 9 stress 0.1110443 
## ... Procrustes: rmse 0.0008632665  max resid 0.003322182 
## ... Similar to previous best
## Run 10 stress 0.1110774 
## ... Procrustes: rmse 0.001278632  max resid 0.006131806 
## ... Similar to previous best
## Run 11 stress 0.1110437 
## ... New best solution
## ... Procrustes: rmse 0.0006912448  max resid 0.002720524 
## ... Similar to previous best
## Run 12 stress 0.1110444 
## ... Procrustes: rmse 9.314308e-05  max resid 0.0003646597 
## ... Similar to previous best
## Run 13 stress 0.1198293 
## Run 14 stress 0.119647 
## Run 15 stress 0.1196542 
## Run 16 stress 0.111044 
## ... Procrustes: rmse 0.0001740048  max resid 0.0008994936 
## ... Similar to previous best
## Run 17 stress 0.1244997 
## Run 18 stress 0.1196559 
## Run 19 stress 0.1245093 
## Run 20 stress 0.1196479 
## *** Solution reached
# Create plot, store as temp variable, p
p <- plot_ordination(ps_data, ent.bray.nm, color = "Enterotype", shape = "Enterotype")
    # Add title to each plot
    p <- p + ggtitle("Bray Curtis Distance, MDS")
p

See more explorations of the data at the phyloseq vignette

Remove the outliers and redo the analyses

Take out the samples that were ignored in the paper:

###beware new name for different data set
ps_nata = subset_samples(ps_data, Enterotype !="0")
nata = as(otu_table(ps_nata), "matrix")
nata = nata[-1, ]
nata.dist = dist.JSD(nata)

A complete loop through all distances

dist_methods <- unlist(distanceMethodList)
###Remove distances requiring a tree
dist_methods <- dist_methods[-c(1:3,16)]
# Remove the user-defined distance
dist_methods = dist_methods[-which(dist_methods == "ANY")]
plist <- vector("list", length(dist_methods))
names(plist) = dist_methods
for (i in dist_methods) {
    # Calculate distance matrix
    iDist <- distance(ps_nata, method = i)
    # Calculate ordination
    iMDS <- ordinate(ps_nata, "MDS", distance = iDist)
    ## Make plot Don't carry over previous plot (if error, p will be blank)
    p <- NULL
    # Create plot, store as temp variable, p
    p <- plot_ordination(ps_nata, iMDS, color = "Enterotype", shape = "Enterotype")
    # Add title to each plot
    p <- p + ggtitle(paste("MDS using distance method ", i, sep = ""))
    # Save the graphic to file.
    plist[[i]] = p
}
require(plyr)
df = ldply(plist, function(x) x$data)
names(df)[1] <- "distance"
p = ggplot(df, aes(Axis.1, Axis.2, color = Enterotype, shape = Enterotype))
p = p + geom_point(size = 3, alpha = 0.5)
p = p + facet_wrap(~distance, scales = "free")
p = p + ggtitle("MDS on various distance metrics for Enterotype dataset")
p

With a different clustering variable

(It may be more interesting)

theme_set(theme_bw())
data(enterotype)
enterotype <- subset_taxa(enterotype, Genus != "-1")
df = ldply(plist, function(x) x$data)
names(df)[1] <- "distance"
p = ggplot(df, aes(Axis.1, Axis.2, color = SeqTech, shape = Enterotype))
p = p + geom_point(size = 3, alpha = 0.5)
p = p + facet_wrap(~distance, scales = "free")
p = p + ggtitle("MDS on various distance metrics for Enterotype dataset")
p
## Warning: Removed 378 rows containing missing values (geom_point).

How many clusters?

pam.clustering = function(x, k) {
    # x is a distance matrix and k the number of clusters
    require(cluster)
    cluster = as.vector(pam(as.dist(x), k, diss = TRUE)$clustering)
    return(cluster)
}

The authors fixed the number of clusters to 3, would we?

### The following line is already asking for 3 clusters...
data.cluster = pam.clustering(nata.dist, k = 3)
require(clusterSim)
## Loading required package: clusterSim
## Loading required package: MASS
nclusters = index.G1(t(nata), data.cluster, d = nata.dist, centrotypes = "medoids")
nclusters = NULL
for (k in 1:20) {
    if (k == 1) {
        nclusters[k] = NA
    } else {
        data.cluster_temp = pam.clustering(nata.dist, k)
        nclusters[k] = index.G1(t(nata), data.cluster_temp, d = nata.dist, centrotypes = "medoids")
    }
}
plot(nclusters, type = "h", xlab = "k clusters", ylab = "CH index", 
    main = "Optimal number of clusters (Calinsky and Harabsz 1974)")

or we could use the Gap Statistic:

exord = ordinate(ps_nata, method = "MDS", distance = "jsd")
print(gs, method = "Tibs2001SEmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = x[, axes], FUNcluster = FUNcluster, K.max = K.max,     B = B, verbose = verbose)
## B=50 simulated reference sets, k = 1..6; spaceH0="scaledPCA"
##  --> Number of clusters (method 'Tibs2001SEmax', SE.factor=1): 1
##            logW     E.logW       gap     SE.sim
## [1,] -0.1063849  0.2030794 0.3094643 0.06965674
## [2,] -0.4072377 -0.1443816 0.2628562 0.07420011
## [3,] -0.7708693 -0.4291677 0.3417015 0.06144951
## [4,] -0.9462297 -0.6532264 0.2930034 0.06490076
## [5,] -1.1250085 -0.8378713 0.2871372 0.06525161
## [6,] -1.3168929 -0.9932512 0.3236417 0.07782890
plot_clusgap(gs)

Ordination as done in Nature Paper: drop 9 outliers

require(ade4)
## Loading required package: ade4
data = as(otu_table(ps_data), "matrix")
data = data[-1, ]
clus=sample_data(ps_data)$Enterotype
data123=data[,-which(clus==0)]
data.dist123=dist.JSD(data123)
obs.pcoa=dudi.pco(data.dist123,scannf=F,nf=3)
s.class(obs.pcoa$li, grid=F,fac=clus[-which(clus==0)])

Compare to figure in the paper

EnterotypeFigure

EnterotypeFigure

This was actually made by creating new axes by using a supervised method (between group analyses) to polarize the clustering effect.

Abritrary Choice of Underlying structure?

Is a Gradient better?

Could have tested the ordering gradient and found it significant, why use an underlying categorical variable?

Is the underlying variable a categorical one?

dist123.mst=mstree(data.dist123,1)
s.label(obs.pcoa$li, clab = 0, cpoi = 2, neig = dist123.mst, cnei = 1)

We could clean up the graphical representation:

obs.pcoa2=dudi.pco(data.dist123,scannf=F,nf=2)
pcoa.mst=mstree(dist(obs.pcoa2$li),1)
s.label(obs.pcoa2$li, cpoi = 2, neig = pcoa.mst, cnei = 1,label=paste(newvar),boxes=F)

You could even test the gradient

require(vegan)
## Loading required package: vegan
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-5
newvarm=as.matrix(newvar)
res=adonis(data.dist123~newvarm,perm=99999)
res
## 
## Call:
## adonis(formula = data.dist123 ~ newvarm, permutations = 99999) 
## 
## Permutation: free
## Number of permutations: 99999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## newvarm    1   0.49049 0.49049  10.688 0.26268  1e-05 ***
## Residuals 30   1.37676 0.04589         0.73732           
## Total     31   1.86725                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Beware This is not a valid way of doing analyses, ie look at the data, then test on the same data.

How many ways ??

How many different choices did we make along the way, let’s count:
- Choose the data transformation (here proportions replaced the original counts). … log, rlog, subsample, prop, orig. - Take a subset of the data, some samples declared as outliers.
… leave out 0, 1, 2 ,..,9, + criteria (10)……
- Filter out certain taxa (unknown labels, rare, etc…)
… remove rare taxa (threshold at 0.01%, 1%, 2%,…)
- Choose a distance.
… 40 choices in vegan/phyloseq.
- Choose an ordination method and number of coordinates.
… MDS, NMDS, k=2,3,4,5..
- Choose a clustering method, choose a number of clusters.
… PAM, KNN, density based, hclust …
- Choose an underlying continuous variable (gradient or group of variables: manifold).
- Choose a graphical representation.

One answer:

\[5\times 100 \times 10 \times 40 \times 8 \times 16 \times 2 \times 4 = 204,800,000\]